We need to build a model that will help the marketing team identify potential customers who are relatively more likely to subscribe to a term deposit, and thus increase their hit ratio.
# importing required libraries
import math
import pandas as pd # For data processing, CSV file I/O (e.g. pd.read_csv())
import numpy as np # For Linear Algebra
import pandas_profiling as pf # Generates profile reports from pandas DataFrame
#importing Machine Learning parameters and classifiers
from sklearn import preprocessing # provides several common utility functions and transformer classes to change raw feature vectors into a representation more suitable for the downstream estimators
from sklearn.linear_model import LogisticRegression # Logistic Regression (aka logit, MaxEnt) classifier
from sklearn.tree import DecisionTreeClassifier #
from sklearn.feature_selection import RFE # Feature ranking with recursive feature elimination.
from sklearn import metrics # includes score functions, performance metrics and pairwise metrics and distance computations
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score # Build a text report showing the main classification metrics,Compute confusion matrix to evaluate the accuracy of a classification.
from sklearn.model_selection import train_test_split # splits data into random train and test subsets
from sklearn.preprocessing import LabelEncoder
#Ensemble classifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
#Visulization Libraries
from yellowbrick.classifier import ClassificationReport, ROCAUC
from IPython.display import Markdown, display
import seaborn as sns # data visualization lib based upon matplotlib
import matplotlib.pyplot as plt # is a state-based interface to matplotlib. It provides a MATLAB-like way of plotting
%matplotlib inline
# a magic function which sets the backend of matplotlib to the 'inline' backend
# Load the bank marketing dataset into the working DataFrame.
# NOTE(review): the original UCI distribution of bank-full.csv is ';'-separated;
# confirm this copy is comma-separated, otherwise pass sep=';'.
dfBankData = pd.read_csv('bank-full.csv')
Data types and description of the independent attributes, which should include: name, meaning, range of values observed, central values (mean and median), standard deviation and quartiles, analysis of the body and tails of the distributions, missing values, and outliers.
# First look at the raw data: sample rows, dimensions, dtypes, summary stats.
# NOTE: bare expressions such as head()/shape only render in a notebook cell.
dfBankData.head(20)
dfBankData.shape
# Data types present in data
dfBankData.dtypes
# Transposed describe() puts one feature per row for easier reading.
dfBankData.describe().T
dfBankData.info()
# Missing-value audit: isnull() and isna() are aliases in pandas, so the two
# checks below are expected to agree.
print("Are there any null values ? : ", dfBankData.isnull().values.any())
print("Are there any na values ? : ", dfBankData.isna().values.any())
print("\n")
print("------------Checking for null ------------------")
print(dfBankData.isnull().sum())
print("------------Cheking for NA ---------------------")
print(dfBankData.isna().sum())
There are no null or NA values in the dataset.
# Report the number of distinct values found in every column.
for column_name in dfBankData.columns:
    distinct_count = dfBankData[column_name].nunique()
    print('Col Name {0}: Unique values {1}'.format(column_name, distinct_count))
# Outlier detection via the Inter-Quartile Range (IQR = Q3 - Q1):
# a value is flagged when it falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# https://en.wikipedia.org/wiki/Interquartile_range
cols =['age','balance','day', 'duration', 'campaign', 'pdays', 'previous']
outliersCols=[]
for col in cols:
    first_quartile = dfBankData[col].quantile(0.25)
    third_quartile = dfBankData[col].quantile(0.75)
    iqr_width = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * iqr_width
    high_fence = third_quartile + 1.5 * iqr_width
    # Rows lying outside the fences; an empty selection means no outliers.
    outside_fences = dfBankData.loc[(dfBankData[col] < low_fence) | (dfBankData[col] > high_fence)]
    if outside_fences.empty:
        display(Markdown("There are no outliers in {0}".format(col)))
    else:
        print("There are outliers in {0}".format(col))
        outliersCols.append(col)
# Columns that carry outliers are plotted below.
print(outliersCols)
# Box plot per outlier-carrying column, laid out on a 2-column grid,
# followed by histograms of every numeric column.
grid_cols = 2
grid_rows = len(outliersCols) // grid_cols
fig, axs = plt.subplots(nrows=grid_rows, ncols=grid_cols, figsize=(15, 15))
for position, outlier_col in enumerate(outliersCols):
    sns.set()
    row_idx, col_idx = divmod(position, grid_cols)
    sns.boxplot(dfBankData[outlier_col], ax=axs[row_idx][col_idx])
plt.figure()
dfBankData.hist(bins=20, figsize=(15, 10), color='red')
plt.show()
# Count plot for each categorical feature, arranged on a 3-column grid.
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
noOfRows=(int)(len(cat_cols)/3)  # 9 categorical columns -> 3 rows
noOfCols=3
fig, axs = plt.subplots(nrows = noOfRows, ncols = noOfCols, figsize=(15,15))
colIndex = 0
# NOTE(review): this plt.figure() opens an extra empty figure; the count plots
# are drawn on `axs` created above, so it looks unintentional.
plt.figure()
for catCol in cat_cols:
    # row = colIndex // 3, column = colIndex % 3 walks the grid left-to-right.
    sns.countplot(y=catCol, data=dfBankData, ax = axs[math.floor(colIndex/3)][colIndex % 3])
    colIndex += 1
# Full automated EDA report.
# NOTE(review): pandas_profiling has been renamed to ydata-profiling; confirm
# the installed version still exposes pf.ProfileReport.
pf.ProfileReport(dfBankData)
# Split the feature list by dtype so numeric and categorical columns can be
# treated differently downstream (object dtype -> categorical).
#Numerical cols
numericalCols = list(dfBankData.select_dtypes(exclude=['object']))
categoricalCols = list(dfBankData.select_dtypes(include=['object']))
print(" Numerical Columns are : ", numericalCols, "\n")
print(" Categorical Columns are : ", categoricalCols)
# Frequency table for every numeric column.
for nu_col in numericalCols:
    print("******************************", nu_col, ":\n" )
    print(dfBankData[nu_col].value_counts(), "\n")
# Survey the columns that contain the literal value 'unknown' and record
# [column, occurrence count] pairs for later imputation.
colsWithUnknownVals = []
for col in dfBankData.columns:
    if 'unknown' not in dfBankData[col].values:
        continue
    unknown_count = dfBankData[dfBankData[col].str.contains('unknown')][col].count()
    colsWithUnknownVals.append([col, unknown_count])
    print("******************************", "\n")
    print("Values Count in - {0} - having unknown value : ".format(col))
    print(dfBankData[col].value_counts(), "\n")
print("************************************************", "\n")
print("Columns with unknown values : \n", colsWithUnknownVals)
print("************************************************", "\n")
If no other criterion applies, 'unknown' can be replaced with the first categorical value or the most frequently occurring categorical value of that column.
Dealing with outliers
a. Bi-variate analysis between the predictor variables and the target column. Comment on your findings in terms of their relationship and degree of relation, if any.
Visualize the analysis using box plots and pair plots, histograms or density curves. Select the most appropriate attributes.
b. Please provide comments in the Jupyter notebook regarding the steps you take and the insights drawn from the plots.
# Bi-variate analysis against the target.
# Target column - 'Target'
# Numerical columns - Numerical Columns are : ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
# Categorical Columns are : ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
# Pairwise scatter matrix of the numeric features (can be slow on large data).
sns.pairplot(dfBankData);
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
noOfRows=(int)(len(cat_cols)/3)  # 9 categorical columns -> 3 rows
noOfCols=3
fig, axs = plt.subplots(nrows = noOfRows, ncols = noOfCols, figsize=(25,25))
colIndex = 0
# NOTE(review): stray empty figure -- the count plots below target `axs`.
plt.figure()
for catCol in cat_cols:
    # Per-category counts split by target class to show subscription patterns.
    sns.countplot(x=catCol, hue='Target', data=dfBankData, ax = axs[math.floor(colIndex/3)][colIndex % 3])
    colIndex += 1
# Correlation heatmap of the numeric features.
plt.figure(figsize = (10,10))
# numeric_only=True: the frame still holds object columns at this point, and
# pandas >= 2.0 raises a TypeError when corr() sees non-numeric data.
sns.heatmap(dfBankData.corr(numeric_only=True), annot = True)
# Age vs balance, coloured by the target class.
plt.figure(figsize = (10,8))
# seaborn >= 0.12 only accepts x/y as keyword arguments; passing the two
# Series positionally raises a TypeError there.
sns.scatterplot(x=dfBankData['age'], y=dfBankData['balance'], hue = dfBankData['Target'])
# Histograms of every numeric column.
plt.figure(figsize = (18,16))
dfBankData.hist();
# Age distribution per education level, split by target.
plt.figure(figsize = (10,8))
sns.boxplot(x= dfBankData['age'], y=dfBankData['education'], hue=dfBankData['Target'])
# Call duration per contact month, split by target.
plt.figure(figsize = (10,8))
sns.boxplot(x= dfBankData['duration'], y=dfBankData['month'], hue=dfBankData['Target'])
Customers who subscribed were contacted more recently ('pdays') and more frequently before the campaign ('previous') by the bank, though for a shorter duration, and are mostly in management positions.
1.Ensure the attribute types are correct. If not, take appropriate actions.
2.Get the data model ready.
3.Transform the data i.e. scale / normalize if required
4. Create the training set and test set in a ratio of 70:30.
# Many columns still carry the generic 'object' dtype; convert them to pandas'
# 'category' dtype, which is more memory-efficient and explicitly categorical.
categoricalCols = list(dfBankData.select_dtypes(include=['object']))
for category_column in categoricalCols:
    dfBankData[category_column] = dfBankData[category_column].astype('category')
# Confirm the conversion took effect.
dfBankData.dtypes
# Class balance: row counts per target value.
dfBankData.groupby('Target').count()
def bucketing_balance(data):
    """Replace the continuous `balance` column, in place, with ordinal buckets 1-4.

    Bucket edges are 72, 1428, 3462 and 102127 (presumably quartile-based --
    TODO confirm against describe() output); values above 102127 are left
    untouched.  Returns the mutated DataFrame for convenience.
    """
    bucket_specs = [
        (data['balance'] <= 72, 1),
        ((data['balance'] > 72) & (data['balance'] <= 1428), 2),
        ((data['balance'] > 1428) & (data['balance'] <= 3462), 3),
        ((data['balance'] > 3462) & (data['balance'] <= 102127), 4),
    ]
    for bucket_mask, bucket_label in bucket_specs:
        data.loc[bucket_mask, 'balance'] = bucket_label
    return data
# Apply the in-place balance bucketing to the working DataFrame.
bucketing_balance(dfBankData)
def bucketing_education(df):
    """Heuristically impute 'unknown' job/education values from related columns, in place.

    The rules are applied in order; each predicate is evaluated lazily so that
    an earlier imputation (e.g. job -> 'retired') is respected by later rules.
    NOTE(review): the 'basic.*'/'professional.course' education levels belong to
    the bank-additional dataset variant -- confirm they occur in this file.
    """
    imputation_rules = [
        (lambda d: (d['age'] > 60) & (d['job'] == 'unknown'), 'job', 'retired'),
        (lambda d: (d['education'] == 'unknown') & (d['job'] == 'management'), 'education', 'tertiary'),
        (lambda d: (d['education'] == 'unknown') & (d['job'] == 'services'), 'education', 'secondary'),
        (lambda d: (d['education'] == 'unknown') & (d['job'] == 'housemaid'), 'education', 'primary'),
        (lambda d: (d['job'] == 'unknown') & (d['education'] == 'basic.4y'), 'job', 'blue-collar'),
        (lambda d: (d['job'] == 'unknown') & (d['education'] == 'basic.6y'), 'job', 'blue-collar'),
        (lambda d: (d['job'] == 'unknown') & (d['education'] == 'basic.9y'), 'job', 'blue-collar'),
        (lambda d: (d['job'] == 'unknown') & (d['education'] == 'professional.course'), 'job', 'technician'),
    ]
    for predicate, target_column, imputed_value in imputation_rules:
        df.loc[predicate(df), target_column] = imputed_value
# Apply the heuristic job/education imputation in place.
bucketing_education(dfBankData)
# Any 'unknown' still left after the heuristics falls back to the column mode.
dfBankData['job'] = dfBankData.job.replace('unknown',dfBankData.job.mode()[0])
dfBankData['education'] = dfBankData.education.replace('unknown',dfBankData.education.mode()[0])
#putting age into bins
def bucketing_age(df):
    """Collapse `age`, in place, into decade buckets: 20 (<30), 30, 40, 50 and 60 (>=60)."""
    decade_rules = [
        (lambda age: age < 30, 20),
        (lambda age: (age >= 30) & (age <= 39), 30),
        (lambda age: (age >= 40) & (age <= 49), 40),
        (lambda age: (age >= 50) & (age <= 59), 50),
        (lambda age: age >= 60, 60),
    ]
    for predicate, decade in decade_rules:
        df.loc[predicate(df["age"]), "age"] = decade
# Bucket ages into decades, in place.
bucketing_age(dfBankData)
dfBankData.head(10)
# Convert call duration from seconds to minutes, rounded to 2 decimals.
dfBankData['duration'] = dfBankData['duration'].apply(lambda n:n/60).round(2)
# 10/60 min == 10 seconds on the new minute scale.
print('Rows count having call duration less than 10 Sec -\t',dfBankData[dfBankData.duration < 10/60]['duration'].count())
# drop rows where call duration was less than 10 seconds
# (342 rows in the original run -- confirm on re-execution)
dfBankData = dfBankData.drop(dfBankData[dfBankData.duration < 10/60].index, axis = 0, inplace = False)
# Transform the data i.e. scale / normalize if required
# One-hot encode the remaining categorical features; drop_first avoids the
# dummy-variable trap (perfect multicollinearity between dummies).
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
dfBankData = pd.get_dummies(data = dfBankData, columns = cat_cols, drop_first = True)
dfBankData.head()
# Label-encode 'month' and the binary 'Target'.
# NOTE(review): LabelEncoder orders months alphabetically, not chronologically
# -- confirm this ordering is acceptable for the models used.
label_encoder = LabelEncoder()
dfBankData['month'] = label_encoder.fit_transform(dfBankData['month'])
# Reusing the same encoder instance is fine: fit_transform refits from scratch.
dfBankData['Target'] = label_encoder.fit_transform(dfBankData['Target'])
dfBankData.head(10)
# Correlation heatmap of the now fully numeric frame.
plt.figure(figsize = (20,18))
sns.heatmap(dfBankData.corr(), annot = True)
# Create the training set and test set in a ratio of 70:30.
X = dfBankData.drop('Target', axis = 1)
y = dfBankData['Target']
# random_state=0 makes the split reproducible across runs.
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.3, random_state=0)
# Sanity-check the resulting shapes.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
1. First create models using the Logistic Regression and Decision Tree algorithms. Note the model performance using different metrics. Use a confusion matrix to evaluate class-level metrics, i.e. precision/recall. Also report the accuracy and F1-score of the model.
2. Build the ensemble models (bagging and boosting) and note the model performance using different metrics. Use the same metrics as in the above models (at least 3 algorithms).
# ---------------- Logistic Regression ----------------
# NOTE(review): the default max_iter=100 can emit a ConvergenceWarning on this
# data -- consider LogisticRegression(max_iter=1000) if that happens.
mLogReg = LogisticRegression()
mLogReg.fit(X_train, y_train)
y_pred_mLogReg = mLogReg.predict(X_test)
# Fixed typos in the report strings: "Logistirc" -> "Logistic", "Matrics" -> "Matrix".
print("Logistic Regression Model score on training data: {} \n".format(mLogReg.score(X_train, y_train)))
print("Logistic Regression Model score on test data: {} \n".format(mLogReg.score(X_test, y_test)))
print("Confusion Matrix of Logistic Regression Model : \n \n", confusion_matrix(y_test, y_pred_mLogReg))
# Yellowbrick ROC curve (macro average only).
roc = ROCAUC(mLogReg, macro=True, micro=False, per_class=False)
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show();
# Per-class precision/recall/F1 report.
print(classification_report(y_test, y_pred_mLogReg))
# ---------------- Decision Tree (unpruned) ----------------
# NOTE(review): the variable names say "entropy" but criterion='gini' is used;
# the names are kept unchanged for compatibility with the rest of the notebook.
modelDecisionTree_entropy = DecisionTreeClassifier(criterion='gini', random_state = 100,)
modelDecisionTree_entropy.fit(X_train, y_train)
y_pred_dt_entropy = modelDecisionTree_entropy.predict(X_test)
print("Decision Tree Model score on training data: {} \n".format(modelDecisionTree_entropy.score(X_train, y_train)))
print("Decision Tree Model score on test data: {} \n".format(modelDecisionTree_entropy.score(X_test, y_test)))
# Fixed typo in the report string: "Matrics" -> "Matrix".
print("Confusion Matrix of Decision Tree Model : \n \n", confusion_matrix(y_test, y_pred_dt_entropy))
# Yellowbrick ROC curve (macro average only).
roc = ROCAUC(modelDecisionTree_entropy, macro=True, micro=False, per_class=False)
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show();
print(classification_report(y_test, y_pred_dt_entropy))
# ---------------- Decision Tree (pruned) ----------------
# max_depth=3 and min_samples_leaf=5 pre-prune the tree to curb overfitting.
modelDT_entropy_pruned = DecisionTreeClassifier(criterion='gini', max_depth = 3, random_state = 100, min_samples_leaf = 5)
modelDT_entropy_pruned.fit(X_train, y_train)
y_pred_dt_pruned = modelDT_entropy_pruned.predict(X_test)
print("Pruned Decision Tree Model score on training data: {} \n".format(modelDT_entropy_pruned.score(X_train, y_train)))
print("Pruned Decision Tree Model score on test data: {} \n".format(modelDT_entropy_pruned.score(X_test, y_test)))
print("\n")
# Fixed typo in the report string: "Matrics" -> "Matrix".
print("Pruned Confusion Matrix of Decision Tree Model : \n \n", confusion_matrix(y_test, y_pred_dt_pruned))
print(classification_report(y_test, y_pred_dt_pruned))
# Yellowbrick ROC curve (macro average only).
roc = ROCAUC(modelDT_entropy_pruned, macro=True, micro=False, per_class=False)
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show();
# Accumulates test-set accuracy (in percent) per ensemble model for the final comparison.
accuracies = {}
# NOTE(review): BaggingClassifier is already imported at the top of the file;
# this re-import is redundant but harmless.
from sklearn.ensemble import BaggingClassifier
# oob_score=True also estimates generalization error from out-of-bag samples.
model_bgcl = BaggingClassifier(n_estimators = 200,max_samples= .7, bootstrap=True, oob_score=True, random_state = 22)
model_bgcl.fit(X_train, y_train)
model_bgcl_predict = model_bgcl.predict(X_test)
acc_model_bgcl = accuracy_score(y_test, model_bgcl_predict) * 100
# NOTE(review): the dict key is misspelled ('BaggingClassifer'); kept as-is
# because the same spelling is used when reading it back below.
accuracies['BaggingClassifer'] = acc_model_bgcl
accuracies['BaggingClassifer']
# Yellowbrick ROC curve (macro average only).
roc = ROCAUC(model_bgcl, macro=True, micro=False, per_class=False)
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show();
print(classification_report(y_test, model_bgcl_predict))
# NOTE(review): GradientBoostingClassifier is already imported at the top of
# the file; this re-import is redundant but harmless.
from sklearn.ensemble import GradientBoostingClassifier
# 200 boosting stages with a conservative learning rate of 0.1.
gbcl = GradientBoostingClassifier(n_estimators = 200,learning_rate = 0.1, random_state = 22)
gbcl = gbcl.fit(X_train, y_train)
gbcl_predict = gbcl.predict(X_test)
acc_gbcl = accuracy_score(y_test, gbcl_predict) * 100
accuracies['GradientBoostingClassifier'] = acc_gbcl
accuracies['GradientBoostingClassifier']
# Yellowbrick ROC curve (macro average only).
roc = ROCAUC(gbcl, macro=True, micro=False, per_class=False)
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show();
print(classification_report(y_test, gbcl_predict))
# NOTE(review): AdaBoostClassifier is already imported at the top of the file;
# this re-import is redundant but harmless.
from sklearn.ensemble import AdaBoostClassifier
# 200 boosting rounds; a small learning rate shrinks each weak learner's vote.
abcl = AdaBoostClassifier(n_estimators = 200, learning_rate = 0.1, random_state = 22)
abcl = abcl.fit(X_train, y_train)
abcl_predict = abcl.predict(X_test)
acc_abcl = accuracy_score(y_test, abcl_predict) *100
accuracies['ADA'] = acc_abcl
accuracies['ADA']
# Yellowbrick ROC curve (macro average only).
roc = ROCAUC(abcl, macro=True, micro=False, per_class=False)
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show();
print(classification_report(y_test, abcl_predict))
# Side-by-side comparison of all ensemble models.
accuracies
print("classification_report BaggingClassifer \n ", classification_report(y_test, model_bgcl_predict))
print("classification_report GradientBoostingClassifier \n ", classification_report(y_test, gbcl_predict))
print("classification_report AdaBoostClassifier \n",classification_report(y_test, abcl_predict))
Based on the above metrics, considering recall, F1 and roc_auc_score, the best algorithm to use in this scenario is the BaggingClassifier (comparison between GradientBoostingClassifier and BaggingClassifier).
Following are the reasons for it:
* It has the highest accuracy score
* It has the second-best AUC on the test data
* It has the best F1_yes score, i.e. the F1 score for the Target value of 'yes' (1)